################################################################################
## Group Identities and Parliamentary Debates: Replication package
## Fiva, Nedregård and Øien (2025)

# Description:

## Code to make Figure A3: "Empirical distribution of speech length"

################################################################################

# Packages

library(data.table)
library(lubridate)
library(dplyr)

# Directories: relative paths, resolved against the parent directory that
# master.R sets
dir <- "../data/2_processed_data" # not referenced in the visible part of this script
fig.dir <- "../results/figures"
raw.dir <- "../data/1_raw_data"


# Data

## Speech-level data with time stamps, read from the raw data directory
NPD <- fread(file.path(raw.dir, "NPD.csv"))

# Data wrangling

# Speech length is measured as the time elapsed between the time stamp of a
# speech and the time stamp of the next speech on the same date.
# dplyr is used for this step because, per the original author, lubridate
# does not work well with data.table.
dt.min <- NPD |>
  arrange(id_speech) |>
  # Empty time stamps ("") are treated as missing
  mutate(time = na_if(time, "")) |>
  # Drop dates on which no speech has a time stamp
  group_by(date) |>
  filter(!all(is.na(time))) |>
  ungroup()

dt.min <- dt.min |>
  # Parse in a fixed time zone: without `tz`, as.POSIXct() uses the machine's
  # local zone, so daylight-saving transitions could turn valid clock times
  # into NA or shift a speech length by an hour depending on where the script
  # is run. Only within-day differences are used, so the zone itself is
  # irrelevant as long as it has no DST.
  mutate(datetime = as.POSIXct(paste(date, time),
                               format = "%Y-%m-%d %H:%M:%S",
                               tz = "UTC")) |>
  group_by(date) |>
  # Minutes until the next speech on the same date; NA for the last speech of
  # the day and whenever either time stamp is missing
  mutate(min = interval(datetime, lead(datetime)) / dminutes(1)) |>
  ungroup()

dt <- dt.min |>
  mutate(min = case_when(
    min <= 0 ~ NA_real_, # Nobody can talk for non-positive minutes, set to missing
    min > 12 ~ 12,       # truncating speeches above 12 minutes, see Figure note in paper
    TRUE ~ min
  )) |>
  filter(!is.na(min))


# Plotting

# Open the PDF device that Figure A3 is written to
pdf(file.path(fig.dir, "figA3.pdf"), width = 18, height = 15)

# Horizontal tick labels; enlarged bottom/left margins and axis-title offset
# to make room for the larger text
par(las = 1, mar = c(7, 7, 0, 0), mgp = c(5, 1, 0))

# Bin speech lengths into 1/6-minute (10-second) intervals without drawing
hist.sum <- hist(dt$min, breaks = seq(0, 12, by = 1 / 6), plot = FALSE)

# Multiply the density by the (constant) bin width so that bar heights are
# the share of speeches in each bin
hist.sum$density <- hist.sum$density * diff(hist.sum$breaks)[1]

# Draw the rescaled histogram; the x-axis is suppressed here and added below
plot(
  hist.sum,
  freq = FALSE,
  xlab = "Minutes of speech",
  xlim = c(0, 16),
  main = "",
  ylim = c(0, 0.20),
  xaxt = "n",
  col = "white",
  cex.axis = 2,
  cex.lab = 2
)

# Tick marks at every whole minute from 0 to 12
axis(side = 1, at = seq(0, 12, by = 1), cex.axis = 2, cex.lab = 2)

dev.off()



